{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "695571"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the Instagram dump. The encoding is pinned so decoding does not depend on the platform locale.\n",
    "with open('dumping-instagram-6-july-2019.json', encoding='utf-8') as fopen:\n",
    "    instagram = json.load(fopen)\n",
    "\n",
    "# Number of entries loaded.\n",
    "len(instagram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Roti Jala 10rb/pck #rotijala #nomnommedan #kulinermedan #rotijalamedan #rotijalakari #rotijalakarimedan'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first entry to see what a raw record looks like before cleaning.\n",
    "instagram[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6597867"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the Twitter dump. The encoding is pinned so decoding does not depend on the platform locale.\n",
    "with open('dumping-twitter-6-july-2019.json', encoding='utf-8') as fopen:\n",
    "    twitter = json.load(fopen)\n",
    "\n",
    "# Number of entries loaded.\n",
    "len(twitter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge both dumps into one list that the rest of the notebook processes.\n",
    "# NOTE: this rebinds `twitter`, so re-running the cell appends `instagram` a second time.\n",
    "twitter = [*twitter, *instagram]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:31<00:00, 14661.12it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 1681.47it/s]0.11it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14567.49it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15061.55it/s]\n",
      "100%|██████████| 455839/455839 [00:32<00:00, 14001.83it/s]\n",
      "100%|██████████| 455839/455839 [00:32<00:00, 14137.38it/s]\n",
      " 55%|█████▌    | 250871/455839 [00:28<01:24, 2415.51it/s]]\n",
      "100%|██████████| 455839/455839 [00:32<00:00, 14165.18it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14426.20it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14789.09it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14945.50it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14436.54it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14832.70it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14342.68it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14460.33it/s]\n",
      "100%|██████████| 455839/455839 [01:41<00:00, 4496.72it/s]\n",
      "100%|██████████| 455839/455839 [02:45<00:00, 2753.56it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.26 s, sys: 7.28 s, total: 16.5 s\n",
      "Wall time: 2min 55s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Run cleaning.cleaning_strings over the merged corpus through the cleaning.multiprocessing helper.\n",
    "twitter = cleaning.multiprocessing(\n",
    "    twitter,\n",
    "    cleaning.cleaning_strings,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.63 s, sys: 2.82 s, total: 8.44 s\n",
      "Wall time: 16 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Gather the output of cleaning.unique_words over the corpus and de-duplicate it.\n",
    "# NOTE: set ordering is not guaranteed, so the order of temp_vocab may differ between kernel sessions.\n",
    "temp_vocab = [*set(\n",
    "    cleaning.multiprocessing(twitter, cleaning.unique_words)\n",
    ")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "572032\n",
      "CPU times: user 3.45 s, sys: 1.16 s, total: 4.62 s\n",
      "Wall time: 9.84 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.duplicate_dots_marks_exclamations over the vocabulary (list_mode=False)\n",
    "# and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.duplicate_dots_marks_exclamations,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 155991.85it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156895.45it/s]\n",
      " 79%|███████▊  | 357949/455839 [00:02<00:00, 164568.18it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 153070.23it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 153701.90it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 154252.90it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151358.90it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 159892.82it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 141636.74it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 150624.82it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151594.67it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 153504.06it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 160638.72it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 18997.17it/s]2.27it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149462.36it/s]\n",
      "100%|██████████| 455839/455839 [00:08<00:00, 55337.88it/s]\n",
      "100%|██████████| 455839/455839 [00:12<00:00, 36145.06it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12 s, sys: 6.25 s, total: 18.2 s\n",
      "Wall time: 30.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Rewrite the corpus with cleaning.string_dict_cleaning, using the temp_dict from the previous cell.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1153\n",
      "CPU times: user 1.34 s, sys: 1.16 s, total: 2.5 s\n",
      "Wall time: 3.29 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.remove_underscore over the vocabulary (list_mode=False) and report the size of the result.\n",
    "# NOTE(review): temp_vocab was computed before the corpus was rewritten by the earlier dictionary pass;\n",
    "# confirm that reusing the original vocabulary here is intended.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.remove_underscore,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 169344.74it/s]\n",
      " 75%|███████▍  | 340674/455839 [00:02<00:00, 130332.05it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 158372.23it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156364.07it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 18355.82it/s]6.70it/s]\n",
      " 75%|███████▍  | 339636/455839 [00:02<00:00, 149730.58it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 163305.38it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 161132.15it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 162207.19it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 159552.30it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 157458.91it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 152056.18it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 177201.75it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 161927.04it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 162742.98it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 63489.93it/s]\n",
      "100%|██████████| 455839/455839 [00:11<00:00, 39318.40it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.21 s, sys: 4.03 s, total: 9.24 s\n",
      "Wall time: 19.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the remove_underscore temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "522\n",
      "CPU times: user 1.27 s, sys: 1.31 s, total: 2.58 s\n",
      "Wall time: 3.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.isolate_spamchars over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.isolate_spamchars,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 159378.15it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 173064.79it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 155694.64it/s]\n",
      " 28%|██▊       | 127248/455839 [00:00<00:01, 181827.03it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 167579.79it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 13492.71it/s]6.72it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 135969.00it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 135719.03it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 143408.56it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 139453.06it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 178955.79it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 128991.72it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 179434.99it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 168179.77it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 173186.88it/s]\n",
      "100%|██████████| 455839/455839 [00:06<00:00, 66099.76it/s]\n",
      "100%|██████████| 455839/455839 [00:11<00:00, 41384.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.79 s, sys: 4.44 s, total: 10.2 s\n",
      "Wall time: 21 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the isolate_spamchars temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "115961\n",
      "CPU times: user 1.74 s, sys: 1.28 s, total: 3.01 s\n",
      "Wall time: 3.28 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.break_short_words over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.break_short_words,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:03<00:00, 151354.67it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 148338.35it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156502.51it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 165163.81it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149469.78it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 148194.20it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 141669.61it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156665.80it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 22901.82it/s]4.07it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 140620.26it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149549.53it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 147115.77it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 139904.16it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 148630.39it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 148127.69it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 58242.28it/s]\n",
      "100%|██████████| 455839/455839 [00:12<00:00, 36586.53it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.06 s, sys: 4.65 s, total: 10.7 s\n",
      "Wall time: 22.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the break_short_words temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17506\n",
      "CPU times: user 1.29 s, sys: 1.37 s, total: 2.66 s\n",
      "Wall time: 2.88 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.break_long_words over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.break_long_words,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 164593.57it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 165002.34it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151833.64it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 159067.88it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151473.48it/s]\n",
      " 57%|█████▋    | 259172/455839 [00:01<00:01, 149361.78it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 161196.13it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 146570.87it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 152004.90it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 159428.75it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 161663.75it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 150068.62it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 159090.59it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149547.48it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149581.69it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 59489.51it/s]\n",
      "100%|██████████| 455839/455839 [00:11<00:00, 39998.73it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.39 s, sys: 4.25 s, total: 9.65 s\n",
      "Wall time: 19.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the break_long_words temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4245\n",
      "CPU times: user 1.31 s, sys: 1.33 s, total: 2.64 s\n",
      "Wall time: 2.88 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.remove_ending_underscore over the vocabulary (list_mode=False)\n",
    "# and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.remove_ending_underscore,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 163182.98it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 168575.53it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 170859.16it/s]\n",
      " 36%|███▌      | 161851/455839 [00:01<00:01, 160596.22it/s]\n",
      " 65%|██████▍   | 294237/455839 [00:01<00:00, 162602.07it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 15896.12it/s]7.03it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 157896.83it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156744.19it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 160835.24it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 155500.07it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 162520.16it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 166561.50it/s]\n",
      "  8%|▊         | 38510/455839 [00:00<00:09, 43046.63it/s]s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 169451.08it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 182452.20it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 63594.30it/s] \n",
      "100%|██████████| 455839/455839 [00:11<00:00, 38814.49it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.36 s, sys: 4.11 s, total: 9.47 s\n",
      "Wall time: 20.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the remove_ending_underscore temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3474\n",
      "CPU times: user 1.31 s, sys: 1.49 s, total: 2.79 s\n",
      "Wall time: 3.05 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.remove_starting_underscore over the vocabulary (list_mode=False)\n",
    "# and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.remove_starting_underscore,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:02<00:00, 170447.52it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 169189.68it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 153416.15it/s]\n",
      " 63%|██████▎   | 287886/455839 [00:01<00:01, 156272.75it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156239.15it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 165052.07it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 17783.24it/s]9.97it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156046.64it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149192.90it/s]\n",
      "  3%|▎         | 12419/455839 [00:00<00:14, 31451.86it/s]s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151851.15it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 162261.27it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 167873.46it/s]\n",
      " 54%|█████▍    | 248334/455839 [00:02<00:03, 61527.21it/s]]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 172733.36it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 59714.57it/s]\n",
      "100%|██████████| 455839/455839 [00:12<00:00, 37384.26it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.48 s, sys: 3.95 s, total: 9.43 s\n",
      "Wall time: 20.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the remove_starting_underscore temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1384596\n",
      "CPU times: user 7.34 s, sys: 1.48 s, total: 8.81 s\n",
      "Wall time: 9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.end_punct over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.end_punct,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 455839/455839 [00:03<00:00, 138718.18it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 131907.40it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 137845.59it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 132396.93it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 136418.82it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 136389.88it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 126891.56it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 126748.60it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 141970.74it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 145347.13it/s]\n",
      " 59%|█████▉    | 268321/455839 [00:01<00:01, 135585.84it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 130642.08it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149730.18it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 130984.84it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 19443.79it/s]29it/s]] \n",
      "100%|██████████| 455839/455839 [00:09<00:00, 46788.59it/s]\n",
      "100%|██████████| 455839/455839 [00:15<00:00, 30016.77it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 21.4 s, sys: 7.28 s, total: 28.7 s\n",
      "Wall time: 43.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the end_punct temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "318358\n",
      "CPU times: user 2.62 s, sys: 1.33 s, total: 3.95 s\n",
      "Wall time: 4.15 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.start_punct over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.start_punct,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 31%|███▏      | 142871/455839 [00:01<00:02, 135295.91it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 127220.06it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 130544.73it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 135274.67it/s]\n",
      " 61%|██████    | 277568/455839 [00:02<00:01, 138755.17it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 138831.90it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 144405.82it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 136499.34it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 134813.85it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 142156.88it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 136567.03it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 17497.10it/s]5.74it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 138935.34it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 134702.37it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 145935.46it/s]\n",
      "100%|██████████| 455839/455839 [00:08<00:00, 51486.39it/s]\n",
      "100%|██████████| 455839/455839 [00:14<00:00, 31609.05it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.71 s, sys: 4.39 s, total: 13.1 s\n",
      "Wall time: 27.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the start_punct temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3520\n",
      "CPU times: user 1.28 s, sys: 1.4 s, total: 2.67 s\n",
      "Wall time: 3.18 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run cleaning.join_dashes over the vocabulary (list_mode=False) and report the size of the result.\n",
    "temp_dict = cleaning.multiprocessing(\n",
    "    temp_vocab,\n",
    "    cleaning.join_dashes,\n",
    "    list_mode=False,\n",
    ")\n",
    "print(len(temp_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 74%|███████▍  | 336335/455839 [00:02<00:00, 154102.11it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 146850.56it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 150827.47it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151237.63it/s]\n",
      " 39%|███▉      | 179721/455839 [00:01<00:02, 108220.65it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 150338.22it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 154143.62it/s]\n",
      " 51%|█████▏    | 234140/455839 [00:01<00:01, 139591.71it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 156455.49it/s]\n",
      "100%|██████████| 455839/455839 [00:02<00:00, 152180.14it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 145825.56it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 149500.08it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 148538.33it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 151020.13it/s]\n",
      "100%|██████████| 455839/455839 [00:03<00:00, 150990.48it/s]\n",
      "100%|██████████| 455839/455839 [00:07<00:00, 59330.64it/s]\n",
      "100%|██████████| 455839/455839 [00:12<00:00, 36762.27it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.24 s, sys: 4.13 s, total: 9.37 s\n",
      "Wall time: 20.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Apply the join_dashes temp_dict to every corpus entry via cleaning.string_dict_cleaning.\n",
    "twitter = cleaning.multiprocessing_multiple(\n",
    "    twitter,\n",
    "    temp_dict,\n",
    "    cleaning.string_dict_cleaning,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ternyata kl lg sdih bisa ngasilin makanan enak',\n",
       " 'Abu kampret . . .',\n",
       " 'Bapa saya suka pake Oppo . . . saya sukanya Nokia . . . Kaka saya sukanya Samsung . . . yg penting punya hape aja . . .',\n",
       " 'Ngelamar kasih cincin tp kok mukanya songong ya sedih gue liatnya',\n",
       " 'Caption iki nggarai uwong males nikah min , ya kali manusia arep punah ngunu neg gak nikah . . .',\n",
       " 'Pertanyaannya sederhana , jika kami memang dukung Prabowo ngapain selama kampanye kemarin capek2 dukung Jokowi sampa',\n",
       " '',\n",
       " 'Memiliki sedikit iman lebih berharga dari pada memiliki segudang emas !',\n",
       " 'Untuk mengamankan suara partai , Ahmad Rofiq selaku Sekjen Partai Perindo meminta kepada seluruh caleg dan struktur',\n",
       " 'Dom jakpus sih , bebas mau ketemuan or shopee',\n",
       " 'Bisa dapet duit , ini kaga . Punya mobil juga kan kaya gemer gemer . Ini kaga',\n",
       " 'ON JADAHNYAAA * IN SORRY BAD ENGLISH HIHUHEHEHO',\n",
       " 'Valentino Rossi Tidak Setuju Kompetisi MotoGP Dimulai dari Eropa',\n",
       " 'Sis tak faham , apa yang mungkin ini puncanya tu ? ? ?',\n",
       " '\" Martabak Terang Bulan \" \" Martabak \" untuk yg asin / gurih a.k.a martabak telor \" Terang Bulan \" untuk yg manis ( yg gw s',\n",
       " 'Dia dah tua put , dah nak 31 . Plus dia tak start regularly kat man utd so mesti ka',\n",
       " 'sejarah susah',\n",
       " 'Loop in nama dlm email pon boleh jd issue . . . Dah org email aku reply all jelaaa . . . Ade mase pulak aku nak tengok satu2 nama recipients',\n",
       " 'tak sakit pun tapi saja nak bau minyak freshcare sbb bau lavender',\n",
       " 'ROSMAH',\n",
       " 'Bila kau tengah feeling lagu raya .',\n",
       " 'Kekasih bayangan .',\n",
       " 'Hidup ni jgn terlalu nk mendongak ke atas , nanti jatuh padan muka kau',\n",
       " 'Pak kim toloong . . . @BTS_twt',\n",
       " 'Di rumah ga liat pohon kelapa sama nanas kan apalagi pohon pisang',\n",
       " 'wkwk',\n",
       " 'kanan sja bu',\n",
       " 'Tak pon sebelum masuk dapur bagi salam dulu . . . kan molek gitu . . .',\n",
       " 'Hilang nyawaku aku tgk',\n",
       " 'Masuk ke channel bang evan ke ni',\n",
       " 'yg minat saya pon bole lekk',\n",
       " 'yer lah sbb sombong mmg lah',\n",
       " 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh',\n",
       " 'Ni pukul berapa tah nak sampai ukm . Tetiba jalan tutup pulak . Kena lalu jalan jauh . . .',\n",
       " 'Tkpe , hehe asalkan effort ada',\n",
       " 'Kenapa kipas number 3 pun sejuk . Kalau bilik aku sorang ni , aku dah tutup .',\n",
       " 'Google Cabut Lisensi Android Huawei , Bagaimana Nasib Honor ? - - Tekno',\n",
       " 'Dari semalam tak tidur lagi ek . Ni kejap lagi jangan leceh nampak tilam bantal confirm nyenyak punya',\n",
       " 'tidur di ubin . biar ga jatoh lg',\n",
       " 'Guys , tolong rt tweet ni sampai owner dia dapat . Phone ni tertinggal kat belakang teksi pakcik saya . Model Oppo R9s',\n",
       " 'Jujur kacang ijo ! ! !',\n",
       " 'Sahur tengah malam kaya nya enak ya . . .',\n",
       " 'Jenis-jenis orang stalking di media sosial : - Pakai akun palsu . - Pakai akun temannya , sanak saudaranya , handai tau',\n",
       " 'Benersi ga buka sm yg minyak2 aka gorengan &amp ; makan nasi . Tapi abis pudding setengah lingkaran , makannya mi trs ishy',\n",
       " 'Hahahahha bahaya bela kucing comel2 ni sebab nnti hilang kena curi',\n",
       " 'Aku ada motor racing , aku bawa ronda , awek lu bonceng , sedar sedar seluat tkde dah Punca mat rempit takboleh rap',\n",
       " 'Pak prabowo itu vibesnya kebun binatang banget ya Peliharaannya kucing , sukanya naek kuda , kemana-mana pake baju safari',\n",
       " \"Nak happiness bkn pegih ngn laki lain , happiness it's between u and me , bedek uh kau ckp takda happiness , ss sem\",\n",
       " 'Makan serabi enak pas lagi panas . . . Serabinya terbuat dari kelapa . . . neng Tasya aa ikhlas . . .',\n",
       " 'Siapapun orangnya meski dia ustadz bersorban dan berjubah putih , klo sdh k',\n",
       " 'loh kenapa ? kan marga oppa juga lee , pasti enak yaudah oppa jalan - jalan dong , biar bisa liat pemandangan',\n",
       " 'Ada apa yaa mbak mbak plat AG INII',\n",
       " 'kanan',\n",
       " 'Nak mee kari , nak sate , nak laksa , nak bihun sup nakkkkk semuaaaaa',\n",
       " 'jadi lumba lumba',\n",
       " '- STILL 17 - SEDIHBGT ! ! ! Kebayang kan betapa sedih lu gak tau gimana lu di waktu 18 tahun,19tahun 20thn dan seteru',\n",
       " 'Iyaaa , gue di Hima 2 periode ditambah malamnya gue rapat atau latihan ukm . Jadi kalo mau nongkrong bisanya jam 11 keatas .',\n",
       " 'Bangga manfaat \"DILAN \" perputaran uang yg mendukung pertumbuhan ekonomi mikro-makro,mengurangi pengangguran',\n",
       " 'drpd lahir sampai sekarang aku asyik ngantuk je',\n",
       " 'pgn chatime xixi tp jauh :(',\n",
       " 'Dah tak kasi lampu ijo loh . Tinggal pepet to cuk hwhw',\n",
       " 'kecewa . . .',\n",
       " 'Batok kelapa menjadi bara , Terbakar semua tidak tersisa . Wahai saudara seiman senegara , Saya ucapakan selamat puas',\n",
       " 'JY 91 Liner jgak ke ?',\n",
       " 'Ada benda mcm kotor mcm air susu atas kereta mcm ada org campak . Mula2 ingat mcm taik burung . Tp lain mcm',\n",
       " 'Air koroi',\n",
       " 'Ilmu perpustakaan . Point2 kuliah , ttg manajemen perpustakaan , literasi , informasi , teknologi informasi',\n",
       " 'Ajax spurs lah . Anti menstrim',\n",
       " 'abis telan biji durian kali',\n",
       " 'apaan rambut item . . .',\n",
       " 'Senin , 22 April 2019 kita memperingati hari Bumi . Bumi kita saat ini lagi menjerit kesakitan karena dirusak untuk m',\n",
       " 'Gaya hidup sihat delayed',\n",
       " 'Lia pulang , mereka semuanya pedo kecuali aku , jangan mau .',\n",
       " 'Bangun lambat . Lepas tu jalan jem gile . Haihhhh so stressss',\n",
       " 'Nice igstory harini , dah tak nmpak org repost sudan meal project tu',\n",
       " 'Gone apa ? Gitu je laa . Sendu sorang',\n",
       " 'Bukan pola pikir seorang profesor hukum tapi cara berpikir seorang pedagang cendol',\n",
       " 'SobaTani , sebagai upaya meningkatkan generasi petani , Kementan membuat terobosan dengan mengubah Sekolah Tinggi Pen',\n",
       " 'Beomgyu ngambilin confetti yang nyangkut di rambut Jimin dong * Liat gini aja soft akutuh - Cha',\n",
       " 'Eh hello bosan tu sbb kau xmenghayati hahaha',\n",
       " 'Crash on 29 Lebuhraya Damansara Puchong - Putrajaya &amp ; Cyberjaya still delaying traffic 10m more than usual',\n",
       " 'WADUUH KAMU DENGERINNYA SAMBIL MINUM ? ? ?',\n",
       " 'Rasa-rasanya kalo lg gapunya duit gini , nemu duit recehan yang nyelip2 dikantong celana atau nemu duit kerincingan',\n",
       " 'anjing lagi having sex gitu kak ?',\n",
       " \"I'm at CSF Computer Exchange 5 (CX5 ) - in Cyberjaya , Selangor Darul Ehsan w /\",\n",
       " '( Clip 1)Percutian yang menarik haruslah dipadankan dengan tempat rare dan istimewa ! Berlatar belakangkan Gunung Santubong dan berhadapan dengan Laut China Selatan oh indahnya dunia . Jom follow instagram kami :',\n",
       " 'Alhamdulillah hari ni iftar Nasi Kerabu Ayam Madu Kak Yong n Laksam buat kali terakhir sebab kak Aini last da berju',\n",
       " 'saya udah sering banget ngadepin jalanan macet di jakarta tapi sejauh ini yg paling anjing sih semuanya',\n",
       " 'Hi , Baby baru bangun Baby emo .',\n",
       " 'Twitter please do ur magic Ini pertama kali nyah gua ngajak jalan dia karna selama bertahun tahun dia kuliah di j',\n",
       " 'Jum CUCKOO bersama NABIL AHMAD',\n",
       " 'Nikammy',\n",
       " 'Resort Datuk Jhon Gani . . . kuala penyu . . . boleh bawa keluarga . . . santai saja tempatnya . . . pantai nya bersih dan indah . . . tenan',\n",
       " 'Uni kenapa sistem Masuk sekolah ke tingkat lanjutan terlalu susah skr in',\n",
       " 'Krisis perlembagaan kedua bermula balik dgn orang sama dgn',\n",
       " \"Bagi saya , diusia 20an kata 'jahat ' bukan lagi sesederhana mainan yang dirampas atau buku pr yang dirobek teman . J\",\n",
       " 'Lapor . . . arah demak tersendat dari tambak lorok 17.11 wib . . . dan sekarang di terminal terboyo masih rendet .',\n",
       " '* KPH UJUNG TOMBAK PENDUKUNG VISI MISI GUBERNUR KALBAR * Dengan mengoptimalkan tugas dan fungsi pokok kesatuan Peman',\n",
       " 'aku sedih ni tak ada siapa nak hiburkan ke ?',\n",
       " 'gue baru bangun juga lagi males pergi mana rumah gue kek kapal pecah utg kaga main twitter tmn gue zwoakowka kalo ga udh diciduk']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "twitter[:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "\n",
    "_list_laughing = {\n",
    "    'huhu',\n",
    "    'haha',\n",
    "    'gaga',\n",
    "    'hihi',\n",
    "    'wkawka',\n",
    "    'wkwk',\n",
    "    'kiki',\n",
    "    'keke',\n",
    "    'huehue',\n",
    "}\n",
    "\n",
    "def last_cleaning(string):\n",
    "    string = re.sub(r'[ ]+', ' ', string.lower()).strip().split()\n",
    "    string = [\n",
    "        word\n",
    "        for word in string\n",
    "        if not any([laugh in word for laugh in _list_laughing])\n",
    "        and word[: len(word) // 2] != word[len(word) // 2 :]\n",
    "    ]\n",
    "    string = ' '.join(string)\n",
    "    string = (\n",
    "        ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))\n",
    "    )\n",
    "    return string\n",
    "\n",
    "def last_cleaning_strings(strings):\n",
    "    for i in tqdm(range(len(strings))):\n",
    "        strings[i] = last_cleaning(strings[i])\n",
    "    return strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|█▌        | 73148/455839 [00:26<02:00, 3181.25it/s]s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15103.34it/s]\n",
      "100%|██████████| 14/14 [00:00<00:00, 2914.01it/s]0.51it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15135.76it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15039.32it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15024.90it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15025.00it/s]\n",
      "100%|██████████| 455839/455839 [00:29<00:00, 15235.95it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14930.79it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14958.10it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 15070.14it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14554.28it/s]\n",
      "100%|██████████| 455839/455839 [00:30<00:00, 14871.05it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14650.97it/s]\n",
      "100%|██████████| 455839/455839 [00:31<00:00, 14684.81it/s]\n",
      "100%|██████████| 455839/455839 [01:29<00:00, 5104.88it/s]\n",
      "100%|██████████| 455839/455839 [02:23<00:00, 3181.41it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.56 s, sys: 6.95 s, total: 15.5 s\n",
      "Wall time: 2min 30s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "twitter = cleaning.multiprocessing(twitter, last_cleaning_strings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ternyata kl lg sdih bisa ngasilin makanan enak',\n",
       " 'abu kampret . . .',\n",
       " 'bapa saya suka pake oppo . . . saya sukanya nokia . . . saya sukanya samsung . . . yg penting punya hape aja . . .',\n",
       " 'ngelamar kasih tp kok mukanya songong ya sedih gue liatnya',\n",
       " 'caption iki nggarai uwong males nikah min , ya kali manusia arep punah ngunu neg gak nikah . . .',\n",
       " 'pertanyaannya sederhana , jika kami memang dukung prabowo ngapain selama kampanye kemarin capek2 dukung jokowi sampa',\n",
       " '',\n",
       " 'memiliki sedikit iman lebih berharga dari pada memiliki segudang emas !',\n",
       " 'untuk mengamankan suara partai , ahmad rofiq selaku sekjen partai perindo meminta kepada seluruh caleg dan struktur',\n",
       " 'dom jakpus sih , bebas mau ketemuan or shopee',\n",
       " 'bisa dapet duit , ini kaga . punya mobil juga kan kaya gemer gemer . ini kaga',\n",
       " 'on jadahnyaa * in sorry bad english hihuheheho',\n",
       " 'valentino rossi tidak setuju kompetisi motogp dimulai dari eropa',\n",
       " 'sis tak faham , apa yang mungkin ini puncanya tu ? ? ?',\n",
       " '\" martabak terang bulan \" \" martabak \" untuk yg asin / gurih a.k.a martabak telor \" terang bulan \" untuk yg manis ( yg gw s',\n",
       " 'dia dah tua put , dah nak 31 . plus dia tak start regularly kat man utd so mesti ka',\n",
       " 'sejarah susah',\n",
       " 'loop in nama dlm email pon boleh jd issue . . . dah org email aku reply all jelaa . . . ade mase pulak aku nak tengok satu2 nama recipients',\n",
       " 'tak sakit pun tapi saja nak bau minyak freshcare sbb bau lavender',\n",
       " 'rosmah',\n",
       " 'bila kau tengah feeling lagu raya .',\n",
       " 'kekasih bayangan .',\n",
       " 'hidup ni jgn terlalu nk mendongak ke atas , nanti jatuh padan muka kau',\n",
       " 'pak kim toloong . . . @bts_twt',\n",
       " 'di rumah ga liat pohon kelapa sama nanas kan apalagi pohon pisang',\n",
       " '',\n",
       " 'kanan sja bu',\n",
       " 'tak pon sebelum masuk dapur bagi salam dulu . . . kan molek gitu . . .',\n",
       " 'hilang nyawaku aku tgk',\n",
       " 'masuk ke channel bang evan ke ni',\n",
       " 'yg minat saya pon bole lekk',\n",
       " 'yer lah sbb sombong mmg lah',\n",
       " 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh',\n",
       " 'ni pukul berapa tah nak sampai ukm . tetiba jalan tutup pulak . kena lalu jalan jauh . . .',\n",
       " 'tkpe , asalkan effort ada',\n",
       " 'kenapa kipas number 3 pun sejuk . kalau bilik aku sorang ni , aku dah tutup .',\n",
       " 'google cabut lisensi android huawei , bagaimana nasib honor ? - - tekno',\n",
       " 'dari semalam tak tidur lagi ek . ni kejap lagi jangan leceh nampak tilam bantal confirm nyenyak punya',\n",
       " 'tidur di ubin . biar ga jatoh lg',\n",
       " 'guys , tolong rt tweet ni sampai owner dia dapat . phone ni tertinggal kat belakang teksi pakcik saya . model oppo r9s',\n",
       " 'jujur kacang ijo ! ! !',\n",
       " 'sahur tengah malam kaya nya enak ya . . .',\n",
       " 'jenis-jenis orang stalking di media sosial : - pakai akun palsu . - pakai akun temannya , sanak saudaranya , handai tau',\n",
       " 'benersi ga buka sm yg minyak2 aka gorengan &amp ; makan nasi . tapi abis pudding setengah lingkaran , makannya mi trs ishy',\n",
       " 'bahaya bela kucing comel2 ni sebab nnti hilang kena curi',\n",
       " 'aku ada motor racing , aku bawa ronda , awek lu bonceng , sedar sedar seluat tkde dah punca mat rempit takboleh rap',\n",
       " 'pak prabowo itu vibesnya kebun binatang banget ya peliharaannya kucing , sukanya naek kuda , kemana-mana pake baju safari',\n",
       " \"nak happiness bkn pegih ngn laki lain , happiness it's between u and me , bedek uh kau ckp takda happiness , sem\",\n",
       " 'makan serabi enak pas lagi panas . . . serabinya terbuat dari kelapa . . . neng tasya ikhlas . . .',\n",
       " 'siapapun orangnya meski dia ustadz bersorban dan berjubah putih , klo sdh k',\n",
       " 'loh kenapa ? kan marga oppa juga lee , pasti enak yaudah oppa jalan - jalan dong , biar bisa liat pemandangan',\n",
       " 'ada apa yaa mbak mbak plat ag inii',\n",
       " 'kanan',\n",
       " 'nak mee kari , nak sate , nak laksa , nak bihun sup nakk semuaa',\n",
       " 'jadi lumba lumba',\n",
       " '- still 17 - sedihbgt ! ! ! kebayang kan betapa sedih lu gak tau gimana lu di waktu 18 tahun,19tahun 20thn dan seteru',\n",
       " 'iyaa , gue di hima 2 periode ditambah malamnya gue rapat atau latihan ukm . jadi kalo mau nongkrong bisanya jam keatas .',\n",
       " 'bangga manfaat \"dilan \" perputaran uang yg mendukung pertumbuhan ekonomi mikro-makro,mengurangi pengangguran',\n",
       " 'drpd lahir sampai sekarang aku asyik ngantuk je',\n",
       " 'pgn chatime tp jauh :(',\n",
       " 'dah tak kasi lampu ijo loh . tinggal pepet to cuk',\n",
       " 'kecewa . . .',\n",
       " 'batok kelapa menjadi bara , terbakar semua tidak tersisa . wahai saudara seiman senegara , saya ucapakan selamat puas',\n",
       " 'jy 91 liner jgak ke ?',\n",
       " 'ada benda mcm kotor mcm air atas kereta mcm ada org campak . mula2 ingat mcm taik burung . tp lain mcm',\n",
       " 'air koroi',\n",
       " 'ilmu perpustakaan . point2 kuliah , ttg manajemen perpustakaan , literasi , informasi , teknologi informasi',\n",
       " 'ajax spurs lah . anti menstrim',\n",
       " 'abis telan biji durian kali',\n",
       " 'apaan rambut item . . .',\n",
       " 'senin , april 2019 kita memperingati hari bumi . bumi kita saat ini lagi menjerit kesakitan karena dirusak untuk m',\n",
       " 'gaya hidup sihat delayed',\n",
       " 'lia pulang , mereka semuanya pedo kecuali aku , jangan mau .',\n",
       " 'bangun lambat . lepas tu jalan jem gile . haihh so stress',\n",
       " 'nice igstory harini , dah tak nmpak org repost sudan meal project tu',\n",
       " 'gone apa ? gitu je laa . sendu sorang',\n",
       " 'bukan pola pikir seorang profesor hukum tapi cara berpikir seorang pedagang cendol',\n",
       " 'sobatani , sebagai upaya meningkatkan generasi petani , kementan membuat terobosan dengan mengubah sekolah tinggi pen',\n",
       " 'beomgyu ngambilin confetti yang nyangkut di rambut jimin dong * liat gini aja soft akutuh - cha',\n",
       " 'eh hello bosan tu sbb kau xmenghayati',\n",
       " 'crash on 29 lebuhraya damansara puchong - putrajaya &amp ; cyberjaya still delaying traffic 10m more than usual',\n",
       " 'waduuh kamu dengerinnya sambil minum ? ? ?',\n",
       " 'rasa-rasanya kalo lg gapunya duit gini , nemu duit recehan yang nyelip2 dikantong celana atau nemu duit kerincingan',\n",
       " 'anjing lagi having sex gitu kak ?',\n",
       " \"i'm at csf computer exchange 5 (cx5 ) - in cyberjaya , selangor darul ehsan w /\",\n",
       " '( clip 1)percutian yang menarik haruslah dipadankan dengan tempat rare dan istimewa ! berlatar belakangkan gunung santubong dan berhadapan dengan laut china selatan oh indahnya dunia . jom follow instagram kami :',\n",
       " 'alhamdulillah hari ni iftar nasi kerabu ayam madu kak yong n laksam buat kali terakhir sebab kak aini last da berju',\n",
       " 'saya udah sering banget ngadepin jalanan macet di jakarta tapi sejauh ini yg paling anjing sih semuanya',\n",
       " 'hi , baby baru bangun baby emo .',\n",
       " 'twitter please do ur magic ini pertama kali nyah gua ngajak jalan dia karna selama bertahun tahun dia kuliah di j',\n",
       " 'jum cuckoo bersama nabil ahmad',\n",
       " 'nikammy',\n",
       " 'resort datuk jhon gani . . . kuala penyu . . . boleh bawa keluarga . . . santai saja tempatnya . . . pantai nya bersih dan indah . . . tenan',\n",
       " 'uni kenapa sistem masuk sekolah ke tingkat lanjutan terlalu susah skr in',\n",
       " 'krisis perlembagaan kedua bermula balik dgn orang sama dgn',\n",
       " \"bagi saya , diusia 20an kata 'jahat ' bukan lagi sesederhana mainan yang dirampas atau buku pr yang dirobek teman . j\",\n",
       " 'lapor . . . arah demak tersendat dari tambak lorok 17.11 wib . . . dan sekarang di terminal terboyo masih rendet .',\n",
       " '* kph ujung tombak pendukung visi misi gubernur kalbar * dengan mengoptimalkan tugas dan fungsi pokok kesatuan peman',\n",
       " 'aku sedih ni tak ada siapa nak hiburkan ke ?',\n",
       " 'gue baru bangun juga lagi males pergi mana rumah gue kek kapal pecah utg kaga main twitter tmn gue zwoakowka kalo ga udh diciduk']"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "twitter[:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-socialmedia.txt', 'w') as fopen:\n",
    "    fopen.write(' '.join(twitter))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
